The purpose of this script is to plot the SFS for the exome and chr21.
rm(list=ls())
knitr::opts_knit$set(root.dir = '~/OneDrive - University College London/Projects/Ostridge_PanAf/allele_frequencies/')
library(data.table)
options(datatable.fread.datatable=FALSE)
library(ggplot2)
library(tidyr)
library(dplyr)
exome_out_dir="exome/output/"
chr21_out_dir="chr21/output/"
exome.vs.chr21.sfs=function(exome.ac, chr21.ac, subsps=NULL, fixed.sites=F){
subsps=c("", subsps)
for(subsp in subsps){
# Set title according to subspecies
if(subsp==""){title="Total"}
if(subsp=="c"){title="Central"}
if(subsp=="e"){title="Eastern"}
if(subsp=="n"){title="Nigeria-Cameroon"}
if(subsp=="w"){title="Western"}
# Select columns corresponding to the subspecies
subsp.chr21.ac=chr21.ac[, grepl(paste0("^", subsp), names(chr21.ac))]
subsp.exome.ac=exome.ac[, grepl(paste0("^", subsp), names(exome.ac))]
# Calculate chr21 DAF
subsp.chr21.dac=subsp.chr21.ac[, grepl(".dac$", names(subsp.chr21.ac))]
subsp.chr21.aac=subsp.chr21.ac[, grepl(".aac$", names(subsp.chr21.ac))]
subsp.chr21.total.dac=rowSums(subsp.chr21.dac)
subsp.chr21.total.aac=rowSums(subsp.chr21.aac)
subsp.chr21.total.daf=subsp.chr21.total.dac/(subsp.chr21.total.dac+subsp.chr21.total.aac)
# Calculate exome DAF
subsp.exome.dac=subsp.exome.ac[, grepl(".dac$", names(subsp.exome.ac))]
subsp.exome.aac=subsp.exome.ac[, grepl(".aac$", names(subsp.exome.ac))]
subsp.exome.total.dac=rowSums(subsp.exome.dac)
subsp.exome.total.aac=rowSums(subsp.exome.aac)
subsp.exome.total.daf=subsp.exome.total.dac/(subsp.exome.total.dac+subsp.exome.total.aac)
# Remove fixed sites?
if(fixed.sites==F){
subsp.exome.total.daf=subsp.exome.total.daf[subsp.exome.total.daf!=0 & subsp.exome.total.daf!=1]
subsp.chr21.total.daf=subsp.chr21.total.daf[subsp.chr21.total.daf!=0 & subsp.chr21.total.daf!=1]
}
# Plot - normal scale
SFS=ggplot(NULL) +
theme_bw() +
# Density plots (bandwidth set very low to see all the lumps and bumps)
geom_density(aes(x=subsp.chr21.total.daf, col='Chr21'), alpha=0.3, bw=0.0001) +
geom_density(aes(x=subsp.exome.total.daf, col='Exome'), alpha=0.3, bw=0.0001) +
# General
labs(title=paste0(title,": Exome and Chr21 SFS"),x="DAF", y = "Density") +
theme(plot.title = element_text(hjust = 0.5), legend.text=element_text(size=10)) +
scale_discrete_manual(name='Data', values = c('Exome' = 'red', 'Chr21' = 'blue'), aesthetics = c("colour"))
# Plot - log scale
SFS.log=ggplot(NULL) +
theme_bw() +
# Density plots (bandwidth set very low to see all the lumps and bumps)
geom_density(aes(x=subsp.chr21.total.daf, col='Chr21'), alpha=0.3, bw=0.0001) +
geom_density(aes(x=subsp.exome.total.daf, col='Exome'), alpha=0.3, bw=0.0001) +
# General
scale_y_log10() +
labs(title=paste0(title,": Exome and Chr21 SFS"), x="DAF", y = "Log Density") +
theme(plot.title = element_text(hjust = 0.5), legend.text=element_text(size=10)) +
scale_discrete_manual(name='Data', values = c('Exome' = 'red', 'Chr21' = 'blue'), aesthetics = c("colour"))
print(SFS)
print(SFS.log)
}
}
subsp_names=list(
'all'='All Subspecies',
'ce'='Central-Eastern',
'c'='Central',
'e'='Eastern',
'n'='Nigeria-Cameroon',
'w'='Western')
all.chr21.ac=read.table(paste0(chr21_out_dir, "/chr21.f7.pops_minInd.6or50pct_missing.pops.0.3/chr21.f7.pops_chr21_missing.pops.0.3_pop.allele.counts_minMAC2"), header=T)
all.exome.ac=read.table(paste0(exome_out_dir, "/f5.pops_minInd.6or50pct_missing.pops.0.3/f5.pops.all.chrs_missing.pops.0.3_pop.allele.counts_minMAC2"), header=T)
exome.vs.chr21.sfs(all.exome.ac, all.chr21.ac, subsps=c("c", "e", "n", "w"))
ce.chr21.ac=read.table(paste0(chr21_out_dir, "/chr21.f7.ce.pops_minInd.6or50pct_missing.pops.0.3/chr21.f7.ce.pops_chr21_missing.pops.0.3_pop.allele.counts_minMAC2"), header=T)
ce.exome.ac=read.table(paste0(exome_out_dir, "/f5.ce.pops_minInd.6or50pct_missing.pops.0.3/f5.ce.pops.all.chrs_missing.pops.0.3_pop.allele.counts_minMAC2"), header=T)
exome.vs.chr21.sfs(ce.exome.ac, ce.chr21.ac, subsps=c("c", "e"))
n.chr21.ac=read.table(paste0(chr21_out_dir, "/chr21.f7.n.pops_minInd.6or50pct_missing.pops.0/chr21.f7.n.pops_chr21_missing.pops.0.0_pop.allele.counts_minMAC2"), header=T)
n.exome.ac=read.table(paste0(exome_out_dir, "/f5.n.pops_minInd.6or50pct_missing.pops.0/f5.n.pops.all.chrs_missing.pops.0_pop.allele.counts_minMAC2"), header=T)
exome.vs.chr21.sfs(n.exome.ac, n.chr21.ac)
w.chr21.ac=read.table(paste0(chr21_out_dir, "/chr21.f7.w.pops_minInd.6or50pct_missing.pops.0.3/chr21.f7.w.pops_chr21_missing.pops.0.3_pop.allele.counts_minMAC2"), header=T)
w.exome.ac=read.table(paste0(exome_out_dir, "/f5.w.pops_minInd.6or50pct_missing.pops.0.3/f5.w.pops.all.chrs_missing.pops.0.3_pop.allele.counts_minMAC2"), header=T)
exome.vs.chr21.sfs(w.exome.ac, w.chr21.ac)
sfs.per.population=function(ac, subsps=NULL, fixed.sites=F){
subsps=c("", subsps)
for(subsp in subsps){
# Set title according to subspecies
if(subsp==""){title="Total"}
if(subsp=="c"){title="Central"}
if(subsp=="e"){title="Eastern"}
if(subsp=="n"){title="Nigeria-Cameroon"}
if(subsp=="w"){title="Western"}
# Select columns coresponding to the subspecies
subsp.ac=ac[, grepl(paste0("^", subsp), names(ac))]
subsp.ac=ac[, grepl(paste0("^", subsp), names(ac))]
subsp.dac=subsp.ac[, grepl(".dac$", names(subsp.ac))]
subsp.aac=subsp.ac[, grepl(".aac$", names(subsp.ac))]
subsp.total.dac=rowSums(subsp.dac)
subsp.total.aac=rowSums(subsp.aac)
subsp.total.daf=subsp.total.dac/(subsp.total.dac+subsp.total.aac)
# Remove fixed sites?
if(fixed.sites==F){
subsp.total.daf=subsp.total.daf[subsp.total.daf!=0 & subsp.total.daf!=1]
subsp.total.daf=subsp.total.daf[subsp.total.daf!=0 & subsp.total.daf!=1]
}
# Plot per population
subsp.daf=as.matrix(subsp.dac)/(as.matrix(subsp.dac)+as.matrix(subsp.aac))
# Make into long format (for ggplot)
subsp.daf_long = gather(as.data.frame(subsp.daf))
# Remove pointless suffix of column names
subsp.daf_long$key=gsub("_chr1.dac", "", subsp.daf_long$key)
# Plot - normal
SFS=ggplot(subsp.daf_long) +
theme_bw() +
geom_density(aes(x=value, col=key), alpha=0.3, bw=0.0001)+
labs(title=paste0(title,": SFS"),x="DAF", y = "Density") +
theme(plot.title = element_text(hjust = 0.5), legend.text=element_text(size=10)) +
scale_discrete_manual(name='Data', values = 1:ncol(subsp.dac), aesthetics = c("colour"))
# Plot - log scale
SFS.log=ggplot(subsp.daf_long) +
theme_bw() +
geom_density(aes(x=value, col=key), alpha=0.3, bw=0.01)+
labs(title=paste0(title,": SFS"),x="DAF", y = "Density") +
theme(plot.title = element_text(hjust = 0.5), legend.text=element_text(size=10)) +
scale_discrete_manual(name='Data', values = 1:ncol(subsp.dac), aesthetics = c("colour")) +
scale_y_log10()
print(SFS)
print(SFS.log)
}
}
NB: errors such as ‘## Warning: Removed 1262248 rows containing non-finite values (stat_density).’ I think are just due to the fact that missing data is given as 0 derived and ancestral allele count and 0/0 is undefined.
sfs.per.population(all.exome.ac, subsps=c("c", "e", "n", "w"))
sfs.per.population(ce.exome.ac, subsps=c("c", "e"))
sfs.per.population(n.exome.ac)
sfs.per.population(w.exome.ac)
sfs.per.population(all.chr21.ac, subsps=c("c", "e", "n", "w"))
sfs.per.population(ce.chr21.ac, subsps=c("c", "e"))
sfs.per.population(n.chr21.ac)
sfs.per.population(w.chr21.ac)
n_bins=20
# Combine datasets
all.chr21.ac$data='chr21'
all.exome.ac$data='exome'
colnames(all.chr21.ac)=gsub("_chr21", "", colnames(all.chr21.ac))
colnames(all.exome.ac)=gsub("_chr1", "", colnames(all.exome.ac))
all.ac=rbind(all.exome.ac, all.chr21.ac)
subsps=c('c', 'e', 'n', 'w')
for(subsp in subsps){
# Select columns corresponding to the subspecies
subsp.ac=all.ac[, grepl(paste0("^", subsp), names(all.ac))]
# Calculate DAF
subsp.dac=subsp.ac[, grepl(".dac$", names(subsp.ac))]
subsp.aac=subsp.ac[, grepl(".aac$", names(subsp.ac))]
subsp.total.dac=rowSums(subsp.dac)
subsp.total.aac=rowSums(subsp.aac)
subsp.total.daf=subsp.total.dac/(subsp.total.dac+subsp.total.aac)
# Save
all.ac$place_holder=NA
colnames(all.ac)[colnames(all.ac)=='place_holder']=paste0(subsp, "_daf")
all.ac[[paste0(subsp, "_daf")]]=subsp.total.daf
}
all.ac$c.e.daf.diff=all.ac$c_daf-all.ac$e_daf
all.ac$c.n.daf.diff=all.ac$c_daf-all.ac$n_daf
all.ac$c.w.daf.diff=all.ac$c_daf-all.ac$w_daf
all.ac$e.n.daf.diff=all.ac$e_daf-all.ac$n_daf
all.ac$e.w.daf.diff=all.ac$e_daf-all.ac$w_daf
all.ac$n.w.daf.diff=all.ac$n_daf-all.ac$w_daf
# Remove all NAs - this is from when DAC and AAC = 0 (i.e. data is missing for a whole subspecies) this is very rare
all.ac=all.ac[complete.cases(all.ac),]
library(dplyr)
freq_df.out=NULL
for(col in colnames(all.ac)[grepl("daf.diff$", colnames(all.ac))]){
# Bin values
all.ac$bin=cut(all.ac[[col]], breaks = seq(-1, 1, 2/n_bins), include.lowest=T)
freq_df=all.ac %>%
group_by(data, bin) %>%
summarise(frequency = n())
freq_df$total=NA
freq_df[freq_df$data=='chr21', 'total']=sum(freq_df[freq_df$data=='chr21','frequency'])
freq_df[freq_df$data=='exome', 'total']=sum(freq_df[freq_df$data=='exome','frequency'])
# Calculate frequency as a proportion of all SNPs in each dataset
freq_df$proportion=freq_df$frequency/freq_df$total
# Reshape
freq_df=spread(freq_df[c('bin', 'data', 'proportion')], key = data, value = proportion)
# Calculate ratio
freq_df$exome_to_chr21_ratio=freq_df$exome/freq_df$chr21
# Plot
if(col=="c.e.daf.diff"){title="Central DAF - Eastern DAF"}
if(col=="c.n.daf.diff"){title="Central DAF - Nigeria-Cameroon DAF"}
if(col=="c.w.daf.diff"){title="Central DAF - Western DAF"}
if(col=="e.n.daf.diff"){title="Eastern DAF - Nigeria-Cameroon DAF"}
if(col=="e.w.daf.diff"){title="Eastern DAF - Western DAF"}
if(col=="n.w.daf.diff"){title="Nigeria-Cameroon DAF - Eastern DAF"}
plot=ggplot(freq_df, aes(x=bin, y=exome_to_chr21_ratio))+
theme_bw()+
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title=title, x="DAF Difference", y="Exome:non-genic-chr21 ratio")+
geom_point() +
geom_hline(yintercept = 1, linetype = "dotted")
print(plot)
freq_df$subsps=col
if(is.null(freq_df.out)){
freq_df.out=freq_df
}else{
freq_df.out=rbind(freq_df.out,freq_df)
}
}
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.
ggplot(freq_df.out, aes(x=bin, y=exome_to_chr21_ratio, col=subsps))+
theme_bw()+
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title="Subspecies DAF differences", x="DAF Difference", y="Exome:non-genic-chr21 ratio")+
geom_line(aes(group = subsps)) +
geom_point(size=1) +
geom_hline(yintercept = 1, linetype = "dotted")
all.ac$chr_pos=paste(all.ac$chr, all.ac$pos, sep="_")
all.ac[all.ac$chr_pos=='chr11_5254366', grepl('daf', colnames(all.ac))]
## c_daf e_daf n_daf w_daf c.e.daf.diff c.n.daf.diff
## 302217 1 0.6964865 0.916186 0.09307172 0.3035135 0.083814
## c.w.daf.diff e.n.daf.diff e.w.daf.diff n.w.daf.diff
## 302217 0.9069283 -0.2196995 0.6034148 0.8231143
input=list()
input[['all']][['exome']]=all.exome.ac
input[['all']][['chr21']]=all.chr21.ac
input[['ce']][['exome']]=ce.exome.ac
input[['ce']][['chr21']]=ce.chr21.ac
input[['n']][['exome']]=n.exome.ac
input[['n']][['chr21']]=n.chr21.ac
input[['w']][['exome']]=w.exome.ac
input[['w']][['chr21']]=w.chr21.ac
get_ratio=function(out, subsp, freq_df.out, min=0, max=1){
# Remove NA values (that come from missing data where AC=o and so DAC/(AAC+DAC) is undefined)
out=out[complete.cases(out),]
# Bin values
out$bin=cut(out$daf.diff, breaks = seq(min, max, 0.1), include.lowest=T)
freq_df=out %>%
group_by(data, bin) %>%
dplyr::summarise(frequency = n())
freq_df$total=NA
freq_df[freq_df$data=='chr21', 'total']=sum(freq_df[freq_df$data=='chr21','frequency'])
freq_df[freq_df$data=='exome', 'total']=sum(freq_df[freq_df$data=='exome','frequency'])
# Calculate frequency as a proportion of all SNPs in each dataset
freq_df$proportion=freq_df$frequency/freq_df$total
# Reshape
## Keep frequency
freq_df_2=spread(freq_df[c('bin', 'data', 'frequency')], key = data, value = frequency)
colnames(freq_df_2)[colnames(freq_df_2)!='bin']=paste0(colnames(freq_df_2)[colnames(freq_df_2)!='bin'], "_freq")
freq_df=spread(freq_df[c('bin', 'data', 'proportion')], key = data, value = proportion)
freq_df=merge(freq_df, freq_df_2, by='bin')
#freq_df=spread(freq_df[c('bin', 'data', 'frequency')], key = data, value = frequency)
# Calculate ratio
freq_df$exome_to_chr21_ratio=freq_df$exome/freq_df$chr21
# Add to output
freq_df$subsps=subsp
freq_df.out=rbind(freq_df.out,freq_df)
return(freq_df.out)
}
DAF.diff.pops=function(input, max_snps=10000, min=0, max=1){
set.seed(123)
freq_df.out=data.frame("bin"=c(), "chr21"=c(), "exome"=c(), "chr21_freq"=c(), "exome_freq"=c(), "exome_to_chr21_ratio"=c())
freq_df.out.max=data.frame("bin"=c(), "chr21"=c(), "exome"=c(), "chr21_freq"=c(), "exome_freq"=c(), "exome_to_chr21_ratio"=c())
i=1
for(subsp in names(input)){
cat("Subspecies ", i, "/", length(names(input)), "\n")
i=i+1
daf=list()
pair.diff=list()
pair.diff.max=list()
out=data.frame("data"=c(), "daf.diff"=c())
out.max=data.frame("data"=c(), "daf.diff"=c())
for(data in names(input[[subsp]])){
dac=input[[subsp]][[data]][, grepl(".dac$", names(input[[subsp]][[data]]))]
aac=input[[subsp]][[data]][, grepl(".aac$", names(input[[subsp]][[data]]))]
daf[[data]]=as.matrix(dac)/(as.matrix(dac)+as.matrix(aac))
pair.diff[[data]]=c()
pair.diff.max[[data]]=c()
for(snp in sample(nrow(daf[[data]]), min(max_snps, nrow(daf[[data]])))){
if(min<0){
pair.diff.snp=outer(daf[[data]][snp,], daf[[data]][snp,], `-`)
pair.diff.snp=pair.diff.snp[lower.tri(pair.diff.snp)]
}else{
pair.diff.snp=as.numeric(dist(daf[[data]][snp,]))
}
# Max diff
pair.diff.snp.max=max(abs(pair.diff.snp), na.rm = T)
pair.diff[[data]]=c(pair.diff[[data]], pair.diff.snp)
pair.diff.max[[data]]=c(pair.diff.max[[data]], pair.diff.snp.max)
}
out=rbind(out, data.frame("data"=rep(data, length(pair.diff[[data]])), "daf.diff"=pair.diff[[data]]))
out.max=rbind(out.max, data.frame("data"=rep(data, length(pair.diff.max[[data]])), "daf.diff"=pair.diff.max[[data]]))
}
# Function
freq_df.out=get_ratio(out=out, subsp=subsp, freq_df.out=freq_df.out, min=min, max=max)
freq_df.out.max=get_ratio(out=out.max, subsp=subsp, freq_df.out=freq_df.out.max, min=min, max=max)
}
freq_df.out=freq_df.out[complete.cases(freq_df.out),]
freq_df.out$subsps=unlist(subsp_names[freq_df.out$subsps])
freq_df.out.max=freq_df.out.max[complete.cases(freq_df.out.max),]
freq_df.out.max$subsps=unlist(subsp_names[freq_df.out.max$subsps])
# Plot
plot1=ggplot(freq_df.out, aes(x=bin, y=exome_to_chr21_ratio, col=subsps))+
theme_bw()+
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title="Pairwise DAF differences between populations", x="Absolute DAF Difference", y="Exome:non-genic-chr21 ratio") +
geom_line(aes(group=subsps)) +
geom_point(size=1) +
geom_hline(yintercept = 1, linetype = "dotted") +
ylim(0, NA)+
scale_discrete_manual(name="Subspecies", aesthetics=c('colour', 'fill'), values = c("All Subspecies"='black', "Central-Eastern"='brown4', "Nigeria-Cameroon"='red', "Western"='blue'))
print(plot1)
plot2=ggplot(freq_df.out.max, aes(x=bin, y=exome_to_chr21_ratio, col=subsps))+
theme_bw()+
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title="Maximum DAF differences between populations", x="Absolute DAF Difference", y="Exome:non-genic-chr21 ratio") +
geom_line(aes(group=subsps)) +
geom_point(size=1) +
geom_hline(yintercept = 1, linetype = "dotted")+
ylim(0, NA)+
scale_discrete_manual(name="Subspecies", aesthetics=c('colour', 'fill'), values = c("All Subspecies"='black', "Central-Eastern"='brown4', "Nigeria-Cameroon"='red', "Western"='blue'))
print(plot2)
return(freq_df.out)
}
#out_df=DAF.diff.pops(input, max_snps=20000, min=-1)
out_df=DAF.diff.pops(input, max_snps=20000, min=0)
## Subspecies 1 / 4
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.
## Subspecies 2 / 4
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.
## Subspecies 3 / 4
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.
## Subspecies 4 / 4
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'data'. You can override using the
## `.groups` argument.